# Section 1 - Introduction: Introduce your general and specific research question(s). What is the general problem area that this analysis contributes to? What specific problem are you trying to solve? Why is this important? Why is this hard? What does theory / prior work tell us about this problem and how are you extending it? What is your approach?
# Section 2 - Data overview: Describe your data at a high level, answering questions such as what are the entities in the data, how many entities are there, and what are the features or feature families relevant to the problem you’re tackling?
library(tidyverse)
library(tidymodels)
library(tidytext)
library(textrecipes)
library(here)
library(moderndive)
# Plot defaults ----
# Make the minimal ggplot2 theme the active theme for subsequent plots.
theme_set(theme_minimal())

# Hex colours kept as named constants.
scotblue <- "#0065BF"
ukred <- "#D00C27"

# Data ----
# Read the CSV; show_col_types = FALSE suppresses the column-spec message.
data <- read_csv(file = "Data/Data.csv", show_col_types = FALSE)
New names:
• `` -> `...1`
# Per-conference subsets; each one is split into train/test further down.
data_acl <- data %>% filter(conference == "acl_2017")
data_conll <- data %>% filter(conference == "conll_2016")
# Train/test partition of the full data set, stratified on RECOMMENDATION.
# The seed is fixed so the partition is reproducible.
set.seed(1234)
data_splot <- data %>% initial_split(strata = RECOMMENDATION)
train <- data_splot %>% training()
test <- data_splot %>% testing()
# 95% confidence intervals for the coefficients of the fitted linear model.
# NOTE(review): RECOMMENDATION_model is assigned further down in this file;
# run top to bottom, this call precedes that assignment - confirm the order.
confint(object = RECOMMENDATION_model, level = 0.95)
2.5 % 97.5 %
(Intercept) -1.972593146 0.44889629
IMPACT -0.149811797 0.21050962
SUBSTANCE 0.396858776 0.63581070
APPROPRIATENESS -0.039465619 0.40327408
MEANINGFUL_COMPARISON 0.001246276 0.22906417
SOUNDNESS_CORRECTNESS -0.091965316 0.13919052
ORIGINALITY -0.098122069 0.14034219
CLARITY 0.115166033 0.32227939
REVIEWER_CONFIDENCE -0.163537596 0.06747741
# Residual diagnostics: distribution of the model residuals.
# get_regression_points() returns one row per observation, including the
# fitted value (RECOMMENDATION_hat) and the residual.
model_points <- get_regression_points(RECOMMENDATION_model)

# Plot the residual column so the histogram matches its "Residual" axis label,
# and read from model_points, the object assigned just above.
ggplot(model_points, aes(x = residual)) +
  geom_histogram(bins = 20) +
  labs(x = "Residual", y = "Count")
# Residuals against SUBSTANCE. The y axis shows the residual column, so it is
# labelled accordingly.
ggplot(model_points, aes(x = SUBSTANCE, y = residual)) +
  geom_point() +
  labs(x = "SUBSTANCE", y = "Residual")
# ACL 2017: stratified train/test partition with a fixed seed.
set.seed(1234)
data_split <- data_acl %>% initial_split(strata = RECOMMENDATION)
train_acl <- data_split %>% training()
test_acl <- data_split %>% testing()

# CoNLL 2016: same procedure. data_split is overwritten here, so the ACL
# split object is no longer available after this point.
set.seed(1234)
data_split <- data_conll %>% initial_split(strata = RECOMMENDATION)
train_conll <- data_split %>% training()
test_conll <- data_split %>% testing()
# Linear regression of RECOMMENDATION on the eight predictors in the formula,
# fitted on the ACL training split.
RECOMMENDATION_model <- lm(
  formula = RECOMMENDATION ~ IMPACT + SUBSTANCE + APPROPRIATENESS +
    MEANINGFUL_COMPARISON + SOUNDNESS_CORRECTNESS + ORIGINALITY +
    CLARITY + REVIEWER_CONFIDENCE,
  data = train_acl
)

# Print the call and the coefficient estimates.
RECOMMENDATION_model
Call:
lm(formula = RECOMMENDATION ~ IMPACT + SUBSTANCE + APPROPRIATENESS +
MEANINGFUL_COMPARISON + SOUNDNESS_CORRECTNESS + ORIGINALITY +
CLARITY + REVIEWER_CONFIDENCE, data = train_acl)
Coefficients:
(Intercept) IMPACT SUBSTANCE
-0.83341 -0.01740 0.55182
APPROPRIATENESS MEANINGFUL_COMPARISON SOUNDNESS_CORRECTNESS
0.13264 0.12511 -0.03872
ORIGINALITY CLARITY REVIEWER_CONFIDENCE
0.09463 0.19268 0.05205
# Per-observation fitted values and residuals for the current
# RECOMMENDATION_model.
RECOMMENDATION_model_points <- get_regression_points(RECOMMENDATION_model)

# Same table under the shorter name; reuse it instead of recomputing.
model_points <- RECOMMENDATION_model_points

# Histogram of the residual column, matching the "Residual" axis label.
ggplot(model_points, aes(x = residual)) +
  geom_histogram(bins = 20) +
  labs(x = "Residual", y = "Count")
# Same linear specification, refitted on the CoNLL training split.
# This overwrites the RECOMMENDATION_model object assigned earlier.
RECOMMENDATION_model <- lm(
  formula = RECOMMENDATION ~ IMPACT + SUBSTANCE + APPROPRIATENESS +
    MEANINGFUL_COMPARISON + SOUNDNESS_CORRECTNESS + ORIGINALITY +
    CLARITY + REVIEWER_CONFIDENCE,
  data = train_conll
)

# Print the call and the coefficient estimates.
RECOMMENDATION_model
Call:
lm(formula = RECOMMENDATION ~ IMPACT + SUBSTANCE + APPROPRIATENESS +
MEANINGFUL_COMPARISON + SOUNDNESS_CORRECTNESS + ORIGINALITY +
CLARITY + REVIEWER_CONFIDENCE, data = train_conll)
Coefficients:
(Intercept) IMPACT SUBSTANCE
-2.03045 0.04700 0.16525
APPROPRIATENESS MEANINGFUL_COMPARISON SOUNDNESS_CORRECTNESS
0.16999 0.34972 0.45175
ORIGINALITY CLARITY REVIEWER_CONFIDENCE
0.21587 -0.01760 0.02112
# Per-observation fitted values and residuals for the current
# RECOMMENDATION_model.
RECOMMENDATION_model_points <- get_regression_points(RECOMMENDATION_model)

# Same table under the shorter name; reuse it instead of recomputing.
model_points <- RECOMMENDATION_model_points

# Histogram of the residual column, matching the "Residual" axis label.
ggplot(model_points, aes(x = residual)) +
  geom_histogram(bins = 20) +
  labs(x = "Residual", y = "Count")
# Preprocessing recipe for the resampled model.
# Only the columns named in the formula exist inside the recipe, so every
# selector below must resolve against those columns. Deselecting columns that
# are not in the formula (e.g. longitude/latitude) makes the selector fail
# when the recipe is prepped, which fails every resample.
train_rec <-
  recipe(
    RECOMMENDATION ~ IMPACT + SUBSTANCE + APPROPRIATENESS +
      MEANINGFUL_COMPARISON + SOUNDNESS_CORRECTNESS + ORIGINALITY +
      CLARITY + REVIEWER_CONFIDENCE,
    data = train
  ) %>%
  # Drop incomplete rows; skip = TRUE means this step is not applied when
  # baking new data.
  step_naomit(everything(), skip = TRUE) %>%
  # Reserve a level for factor values not seen in training.
  step_novel(all_nominal(), -all_outcomes()) %>%
  # Centre and scale the numeric predictors.
  step_normalize(all_numeric(), -all_outcomes()) %>%
  # Expand nominal predictors into indicator columns.
  step_dummy(all_nominal(), -all_outcomes()) %>%
  # Remove zero-variance numeric columns.
  step_zv(all_numeric(), -all_outcomes()) %>%
  # Filter predictors whose Spearman correlation exceeds 0.7.
  step_corr(all_predictors(), threshold = 0.7, method = "spearman")
# Random Forest
library(ranger)
# Random-forest specification: ranger engine, impurity-based importance,
# classification mode.
# NOTE(review): classification mode expects a factor outcome, while
# RECOMMENDATION is fitted with lm() elsewhere in this file - confirm it is
# converted to a factor before this workflow is fitted.
rf_spec <-
  rand_forest(mode = "classification") %>%
  set_engine("ranger", importance = "impurity")

# Bundle the preprocessing recipe and the model specification.
rf_wflow <-
  workflow() %>%
  add_recipe(train_rec) %>%
  add_model(rf_spec)
# Ten cross-validation folds over the training split, stratified on
# RECOMMENDATION.
cv_folds <- train %>% vfold_cv(v = 10, strata = RECOMMENDATION)

# Print the resampling object.
cv_folds
# 10-fold cross-validation using stratification
# Fit the workflow on every fold and collect the listed metrics.
# save_pred = TRUE keeps the out-of-fold predictions in the result.
log_res <- fit_resamples(
  rf_wflow,
  resamples = cv_folds,
  metrics = metric_set(
    recall, precision, f_meas,
    accuracy, kap,
    roc_auc, sens, spec
  ),
  control = control_resamples(save_pred = TRUE)
)
x Fold01: preprocessor 1/1: Error in `chr_as_location...
x Fold02: preprocessor 1/1: Error in `chr_as_location...
x Fold03: preprocessor 1/1: Error in `chr_as_location...
x Fold04: preprocessor 1/1: Error in `chr_as_location...
x Fold05: preprocessor 1/1: Error in `chr_as_location...
x Fold06: preprocessor 1/1: Error in `chr_as_location...
x Fold07: preprocessor 1/1: Error in `chr_as_location...
x Fold08: preprocessor 1/1: Error in `chr_as_location...
x Fold09: preprocessor 1/1: Error in `chr_as_location...
x Fold10: preprocessor 1/1: Error in `chr_as_location...
Warning: All models failed. See the `.notes` column.